Practice Lesson 2: Inductive Analytics

Packages

## load required libraries
library(tidyverse)
library(quanteda)
library(lexicon)
library(reshape2)
library(stringi)
library(quanteda.textplots)
library(quanteda.textmodels)
library(quanteda.textstats)
library(gridExtra)
library(seededlda)
library(ggrepel)
library(ggdendro)
library(factoextra)
library(lattice)
library(spacyr)

Clean workspace and set working directory

## clean workspace
rm(list=ls())
## set working directory (WD)
path <- '~/coliphi21/practice_lessons/lesson_2/src/'
## you can also set it dynamically: 
## setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
setwd(path)
## check that WD is set correctly
getwd()
## [1] "/Users/lucienbaumgartner/coliphi21/practice_lessons/lesson_2/src"

Import data

For this tutorial you can either work with your own data, or the pre-built copora provided in the /input-folder for the first practice session. The quanteda-package also contains pre-built corpora you can use. For this session, we scraped the Stanford Encyclopedia of Philosophy and built a corpus including additional metadata.

## relative path
load('../input/stanford-encyclopedia.RDS')
## absolute path
load('~/coliphi21/practice_lessons/lesson_2/input/stanford-encyclopedia.RDS')

Disclaimer

Loading the data above will import a pre-built corpus object into R, which is called sfe.

Inspect data

## how does the corpus object look like?
sfe
## Corpus consisting of 1,712 documents and 21 docvars.
## 18thGerman-preKant.json :
## " In Germany, the eighteenth century was the age of enlighten..."
## 
## abduction.json :
## " In the philosophical literature, the term abduction is used..."
## 
## abelard.json :
## " Peter Abelard (1079–21 April 1142) [Abailard or Abaelard or..."
## 
## abhidharma.json :
## " The first centuries after Śākyamuni Buddha death saw the ri..."
## 
## abilities.json :
## " In the accounts we give of one another, claims about our ab..."
## 
## abner-burgos.json :
## " Abner of Burgos (Alfonso de Valladolid; c. 1260–1347) was p..."
## 
## [ reached max_ndoc ... 1,706 more documents ]
## summary statistics
summary(sfe) %>% head
## available variables
docvars(sfe)

> Exercise

Familiarize yourself a little more with the data.

Prep

## tokenization
toks <- tokens(sfe, what = 'word',
               remove_punct = T, remove_symbols = T, padding = F, 
               remove_numbers = T, remove_url = T)
## to lower
toks <- tokens_tolower(toks)
## lemmatizing
toks <- tokens_replace(toks, 
                       pattern = lexicon::hash_lemmas$token, 
                       replacement = lexicon::hash_lemmas$lemma)
## remove stopwords
toks <- tokens_select(toks,  pattern = stopwords("en"), selection = "remove")
## remove noise
toks <- tokens_select(toks, pattern = '^[A-z]$|[0-9]+|^.$', valuetype = 'regex', selection = 'remove')
## create dfm
dfm_sfe <- dfm(toks) %>% 
           dfm_trim(min_termfreq = 0.8, termfreq_type = "quantile",
                    max_docfreq = 0.1, docfreq_type = "prop")
dfm_sfe
## Document-feature matrix of: 1,712 documents, 24,689 features (98.47% sparse) and 21 docvars.
##                          features
## docs                      ethos immanuel thomasius pietist thomasians wolff well dis halle pietism
##   18thGerman-preKant.json     2        1        33       6         11    36    0   1    19       7
##   abduction.json              0        0         0       0          0     0    0   0     0       0
##   abelard.json                0        0         0       0          0     0    0   0     0       0
##   abhidharma.json             0        0         0       0          0     0    0   0     0       0
##   abilities.json              0        0         0       0          0     0    0   0     0       0
##   abner-burgos.json           0        0         0       0          0     0    0   0     0       0
## [ reached max_ndoc ... 1,706 more documents, reached max_nfeat ... 24,679 more features ]
topfeatures(dfm_sfe, n=200)
##               gene         disability            spinoza             turing           avicenna           buddhist          heidegger                ibn            chinese          spacetime          nietzsche            algebra              gödel            hilbert           einstein            husserl              dewey        computation               reid          molecular      supervenience               luck             peirce     intuitionistic             ockham                 du           cardinal               marx         simulation            privacy           brentano     utilitarianism          algorithm            ordinal               kuhn             neural              bacon         oppression             clarke              jones             monism         capability         conscience             tarski             racial             popper               weyl          fictional            african              trope             arabic             theism 
##               2017               1908               1765               1612               1553               1522               1512               1509               1450               1320               1283               1263               1262               1225               1222               1178               1174               1148               1127               1122               1113               1090               1084               1070               1056               1038               1024               1001                976                975                966                966                964                930                916                913                910                906                900                900                889                863                860                857                855                855                855                853                847                845                842                841 
##           coercion        inheritance             ramsey           averroes                dna                 ai            gravity           strawson        malebranche         domination        reichenbach               user       distributive             artist           artifact             darwin         relativism             scotus         maimonides          whitehead         pythagoras           feminism            climate                 ca   consequentialist                dao            goodman        physicalism          algebraic            fitness              grace           anderson             wright            diagram               hole              arrow             berlin                bce             searle              curve               folk           bayesian           molecule            bolzano               noun              fodor           boethius          newtonian        pythagorean              wolff          confucian               ball 
##                839                836                833                830                827                822                816                812                808                804                796                795                784                780                778                775                775                773                772                772                767                764                763                757                757                755                754                745                742                740                738                735                731                731                730                730                727                726                726                725                725                720                719                715                713                712                710                708                705                700                696                694 
##      demonstrative        pornography             matrix         parmenides            profile         evidential           buddhism        egalitarian           artistic             ritual          principia          recursive             cicero             albert            bradley       cosmological             austin          sovereign         adaptation               bell           japanese       conservation              nagel           chalmers           sidgwick            torture      deterministic           plotinus       architecture             cancer               sage              adams         liberalism            miracle         entailment           triangle            imagery            brouwer            dworkin             herder             christ                 ce             stream          scripture             desert           romantic              thick self-consciousness           electron             plural            collins             sartre 
##                694                693                691                687                683                673                673                670                664                658                657                655                652                651                649                648                644                644                643                642                641                641                638                637                637                636                631                628                627                626                625                622                621                618                616                614                614                611                610                608                607                607                607                605                604                603                601                596                595                595                595                595 
##           berkeley         naturalist          affective            atomism             nozick               twin              tense             statue          armstrong             update            proclus            entropy        temperature             income             buddha               bohr             payoff           chisholm              quale            instant              weber             salmon             metric          pluralist           coercive          celestial               poem      introspection               ross            bentham              graph         liberation          platonism        sovereignty               bois               node     constitutional          dynamical           mystical            slavery                gas            digital             vector          supervene 
##                593                593                592                592                591                590                589                586                583                579                579                576                575                575                574                574                573                573                573                570                570                570                569                566                564                563                562                562                562                561                561                559                558                557                557                557                556                555                554                553                553                552                552                549

> Exercise

Task

Check whether there is still some noise in the data and remove it. Hint: Scan through the topfeatures.

Solution
## remove phi
toks <- tokens_select(toks, pattern = 'φ', valuetype = 'regex', selection = 'remove')
## create dfm
dfm_sfe <- dfm(toks) %>% 
           dfm_trim(min_termfreq = 0.8, termfreq_type = "quantile",
                    max_docfreq = 0.1, docfreq_type = "prop")

Scaling: correspondence analysis

## compute model
sfe_ca <- textmodel_ca(dfm_sfe)
## coerce model coefficients to dataframe
sfe_ca <- data.frame(dim1 = coef(sfe_ca, doc_dim = 1)$coef_document, 
                     dim2 = coef(sfe_ca, doc_dim = 2)$coef_document)

sfe_ca$id <- gsub('\\.json(\\.[0-9])?', '', rownames(sfe_ca))
sfe_ca
## plot full data with branch annotation
ggplot(sfe_ca, aes(x=dim1, y=dim2, label=id)) +
  geom_point(aes(color=dim1-dim2), alpha = 0.2) +
  # plot 0.2 of all labels, using a repel function
  geom_text_repel(data = dplyr::sample_frac(sfe_ca, 0.2), max.overlaps = 15, seed = 6734) +
  theme_bw() +
  theme(plot.title = element_text(face='bold')) +
  labs(title = 'Correspondence Analysis: Full Data')

## plot parts of the data
ggplot(sfe_ca, aes(x=dim1, y=dim2, label=id)) +
  geom_point(aes(color=dim1-dim2), alpha = 0.2) +
  # plot 0.2 of all labels, using a repel function
  geom_text_repel(data = dplyr::sample_frac(sfe_ca, 0.2), max.overlaps = 9, seed = 6734) +
  scale_y_continuous(limits=c(-2,0)) +
  scale_x_continuous(limits=c(-1,1)) +
  theme_bw() +
  theme(plot.title = element_text(face='bold')) +
  labs(title = 'Correspondence Analysis: Zoom')

Unsupervised LDA

## run naive unsupervised topic model with 10 topics
set.seed(123)
sfe_lda <- textmodel_lda(dfm_sfe, k = 10)
## print top 20 terms per topic
terms(sfe_lda, 20)
##       topic1          topic2         topic3           topic4           topic5        topic6        topic7             topic8         topic9        topic10     
##  [1,] "supervenience" "gene"         "disability"     "turing"         "spacetime"   "ockham"      "chinese"          "heidegger"    "privacy"     "ibn"       
##  [2,] "trope"         "molecular"    "oppression"     "gödel"          "einstein"    "bacon"       "spinoza"          "dewey"        "theism"      "avicenna"  
##  [3,] "monism"        "neural"       "african"        "algebra"        "kuhn"        "scotus"      "reid"             "husserl"      "torture"     "buddhist"  
##  [4,] "fictional"     "simulation"   "racial"         "intuitionistic" "popper"      "pythagoras"  "dao"              "du"           "user"        "maimonides"
##  [5,] "bolzano"       "dna"          "coercion"       "computation"    "reichenbach" "boethius"    "nietzsche"        "malebranche"  "clinical"    "arabic"    
##  [6,] "brentano"      "darwin"       "feminism"       "ordinal"        "weyl"        "pythagorean" "confucian"        "artist"       "whitehead"   "averroes"  
##  [7,] "goodman"       "fitness"      "capability"     "hilbert"        "gravity"     "parmenides"  "sidgwick"         "berlin"       "delusion"    "japanese"  
##  [8,] "physicalism"   "inheritance"  "domination"     "cardinal"       "hole"        "proclus"     "utilitarianism"   "artistic"     "theist"      "buddha"    
##  [9,] "strawson"      "fodor"        "pornography"    "tarski"         "ramsey"      "cicero"      "mohists"          "spinoza"      "theistic"    "buddhism"  
## [10,] "noun"          "ai"           "distributive"   "peirce"         "bayesian"    "philo"       "conscience"       "wolff"        "hartshorne"  "dharma"    
## [11,] "bradley"       "artifact"     "marx"           "algorithm"      "entropy"     "plotinus"    "consequentialist" "nietzsche"    "enhancement" "islamic"   
## [12,] "austin"        "imagery"      "egalitarian"    "algebraic"      "bohr"        "sextus"      "luck"             "herder"       "doxastic"    "indian"    
## [13,] "armstrong"     "biologist"    "dworkin"        "recursive"      "newtonian"   "porphyry"    "laozi"            "clarke"       "omnipotent"  "emptiness" 
## [14,] "chisholm"      "drift"        "coercive"       "brouwer"        "payoff"      "abelard"     "zhuangzi"         "sartre"       "embryo"      "mystical"  
## [15,] "entailment"    "biodiversity" "sovereign"      "provable"       "feyerabend"  "bce"         "zhu"              "romantic"     "suicide"     "nishida"   
## [16,] "plural"        "cancer"       "income"         "zfc"            "dynamical"   "iamblichus"  "thick"            "bois"         "internalism" "japan"     
## [17,] "intension"     "quale"        "constitutional" "computable"     "bell"        "fr"          "hutcheson"        "artwork"      "gratitude"   "zen"       
## [18,] "implicature"   "adaptation"   "liberalism"     "cantor"         "gas"         "luther"      "confucius"        "fichte"       "engine"      "hebrew"    
## [19,] "grice"         "genome"       "anderson"       "diagram"        "mach"        "timaeus"     "relativism"       "schopenhauer" "csm"         "vasubandhu"
## [20,] "meinong"       "digital"      "republican"     "definable"      "bet"         "sophist"     "wang"             "collins"      "goldman"     "al-fārābī"
## plot the topics over the correspondence analysis data
sfe_ca$topics <- topics(sfe_lda)
ggplot(sfe_ca, aes(x=dim1, y=dim2, color=topics)) +
  geom_point(alpha = 0.5, shape = '.') +
  geom_density_2d(alpha = 0.5) +
  theme_bw() +
  theme(plot.title = element_text(face='bold')) +
  labs(title = 'Correspondence Analysis with Topic Annotation (k=10)')

> Exercise

Task

Change the names of the topics (to some meaningful description) before plotting.

Solution
sfe_ca$topics <- recode(sfe_ca$topics, topic1 = "body-mind", topic2 = "biology", 
                        topic3 = "feminism/critical thinking", topic4 = "math/ai", 
                        topic5 = "physics", topic6 = "classics", topic7 = "eastern",
                        topic8 = "phenomenology", topic9 = "religion", 
                        topic10 = "middle-eastern/eastern")

PoS-tagging - leaving the sandbox

## set seed
set.seed(48621)
## draw a random sample of 20 documents
sfe_sub <- sfe[sample(1:length(sfe), 5)]
sfe_sub
## Corpus consisting of 5 documents and 21 docvars.
## albert-saxony.json :
## " Albert of Saxony (ca. 1320–1390), Master of Arts at Paris, ..."
## 
## contractarianism.json.1 :
## " Contractarianism names both a political theory of the legit..."
## 
## preferences.json.1 :
## " The notion of preference has a central role in many discipl..."
## 
## plotinus.json :
## " Plotinus (204/5 – 270 C.E.), is generally regarded as the f..."
## 
## paternalism.json :
## " Paternalism is the interference of a state or an individual..."
## PoS-tagging
sfe_pos <- spacy_parse(sfe_sub, pos = T, tag = T, lemma = T, entity = T, dependency = T)
sfe_pos
## look up which adjectives are used most frequently
sfe_pos %>% 
  filter(pos == 'ADJ') %>% 
  group_by(token) %>% 
  summarise(n.occurences = n()) %>% 
  arrange(desc(n.occurences))
## look up which nouns are preceded by the adjective "rational"
rational_noun <- sfe_pos %>% filter(pos == 'NOUN' & lag(token, 1) == 'rational')
rational_noun
# get top 2 nouns per document
rational_noun %>% 
  group_by(doc_id, token) %>% 
  summarise(n.occurences = n()) %>% 
  arrange(doc_id, desc(n.occurences)) %>% 
  slice(1:2)

Augment your sandbox

## to create a corpus-object from your pos-tagged tokens
## we need unique IDs
sfe_pos
## make doc_ids unique
sfe_pos <- mutate(sfe_pos, doc_id = make.unique(doc_id))
## remove punctuation and spaces
sfe_pos <- filter(sfe_pos, !pos %in% c('PUNCT', 'SPACE'))
## make token corpus
sfe_pos <- corpus(sfe_pos, text_field = 'token', docid_field = 'doc_id')
sfe_pos
## Corpus consisting of 35,214 documents and 8 docvars.
## albert-saxony.json.1 :
## "Albert"
## 
## albert-saxony.json.2 :
## "of"
## 
## albert-saxony.json.3 :
## "Saxony"
## 
## albert-saxony.json.5 :
## "ca"
## 
## albert-saxony.json.6 :
## "."
## 
## albert-saxony.json.7 :
## "1320–1390"
## 
## [ reached max_ndoc ... 35,208 more documents ]
docvars(sfe_pos)
## WARNING! This data-structure is incompatible with our document-based corpus!!!
docvars(sfe_sub)
## ... but we can add the info to our token corpus
# add initial document ID to both sets of docvars
docvars(sfe_pos)$initial_docid <- gsub('\\.json.*', '', docid(sfe_pos))
docvars(sfe_sub)$initial_docid <- gsub('\\.json.*', '', docid(sfe_sub))
# join by initial id
docvars(sfe_pos) <- left_join(docvars(sfe_pos), docvars(sfe_sub), by = 'initial_docid')
docvars(sfe_pos)
## Keep in mind: your corpus is still on token level!

Additional material

Hierarchical clustering

## hierarchical clustering - get distances on normalized dfm
sfe_dist_mat <- dfm_weight(dfm_sfe, scheme = "prop") %>%
    textstat_dist(method = "euclidean") %>% 
    as.dist()
## hiarchical clustering the distance object
sfe_cluster <- hclust(sfe_dist_mat, method = 'ward.D')
# label with document names
sfe_cluster$labels <- gsub('\\.json(\\.[0-9])?', '', docnames(dfm_sfe))
## determine best numbers of clusters
# fviz_nbclust(as.matrix(sfe_dist_mat), FUN = hcut, method = "wss")
## cut tree into four groups
clusters <- cutree(sfe_cluster, k = 4)
## add cluster-data to the correspondence analysis
sfe_ca_hcl <- left_join(sfe_ca, data.frame(cluster = clusters, id = names(clusters)))
## plot
ggplot(sfe_ca_hcl, aes(x=dim1, y=dim2, label=id)) +
  geom_point(aes(color=as.factor(cluster)), alpha = 0.2) +
  facet_grid(~as.factor(cluster))

## hierarchical clustering doesn't provide discrete cluster along
## the dimensions of the correspondance analysis

Cosine similarities for documents

## subset documents about logic
logic <- dfm_subset(dfm_sfe, grepl('(?<=\\-)logic|logic(?=\\-)', docnames(dfm_sfe), perl = T))
## compute cosine similarity
logic_sim <- textstat_simil(logic, margin = 'document', method = 'cosine')
## all pairs with a cosine similarity > .4
as.data.frame(logic_sim) %>% 
  filter(cosine > .4) %>% 
  arrange(desc(cosine))

> Exercise

Task

Redo the cosine similarities for another subset of documents.

Solution
## subset documents about aesthetics
aesth <- dfm_subset(dfm_sfe, grepl('aesthetics', docnames(dfm_sfe), perl = T))
## compute cosine similarity
aesth <- textstat_simil(aesth, margin = 'document', method = 'cosine')
## all pairs with a cosine similarity > .2
as.data.frame(aesth) %>% 
  filter(cosine > .2) %>% 
  arrange(desc(cosine))

Cosine similarities for features

## subset documents about feminism
fem <- dfm_subset(dfm_sfe, grepl('(?<=\\-)fem|fem.*(?=\\-)', docnames(dfm_sfe), perl = T))
## compute cosine similarities for the features 
## "empowerment", "embodiment", and "rape"
fem_sim <- textstat_simil(logic, logic[, c("empowerment", "embodiment", "rape")], 
                          margin = 'feature', method = 'cosine')
## top 5 results per feature
as.data.frame(fem_sim) %>% 
  group_by(feature2) %>% 
  arrange(feature2, desc(cosine)) %>% 
  slice_head(n=5)

> Exercise

Task

Redo the cosine similarities for a different set of features.

Solution
fem_sim <- textstat_simil(logic, logic[, c("feminism", "patriarchy")], 
                          margin = 'feature', method = 'cosine')
## top 5 results per feature
as.data.frame(fem_sim) %>% 
  group_by(feature2) %>% 
  arrange(feature2, desc(cosine)) %>% 
  slice_head(n=5)
 




A work by Lucien Baumgartner

https://lucienbaumgartner.github.io/" class="fa fa-home">